1 Processamento dos dados

source("./load_files.R")
data_files_selected <- 
  list.files(
    pattern = ".*carolnovoattila.*csv$",
    recursive = TRUE,
    ignore.case = TRUE
    )

data_files_and_size <- 
  sapply(data_files_selected, file.size)

files_to_include_in_dataframe <- 
  tibble(
    "Files" = names(data_files_and_size),
    "Size (in MB)" = data_files_and_size/1E6
   )

skimr::skim(files_to_include_in_dataframe)
Data summary
Name files_to_include_in_dataf…
Number of rows 2
Number of columns 2
_______________________
Column type frequency:
character 1
numeric 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Files 0 1 101 103 0 2 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Size (in MB) 0 1 39.85 35.83 14.52 27.19 39.85 52.52 65.19 ▇▁▁▁▇
cdr <- load_cdr(names(data_files_and_size))

cdr %<>% 
  mutate(
    expgroup = case_when(
      str_detect(file, "carol") ~ "carol",
      TRUE ~ "unknown"),
    cycle = case_when(
      str_detect(file, "R3") ~ "R3",
      str_detect(file, "R4") ~ "R4",
      TRUE ~ "unknown"),
    time = case_when(
        str_detect(file, "Initial") ~ "initial",
        str_detect(file, "Final") ~ "final",
        TRUE ~ "unknown")) %>% 
  select(cdr3, cycle, time, expgroup, everything())

cdr %<>% 
  group_by(cdr3, expgroup) %>% 
  arrange(cycle, desc(time), .by_group = TRUE) %>% 
  mutate(
    fcp = cdrp / lag(cdrp, default = first(cdrp)),
    fcq = quantity / lag(quantity, default = first(quantity))
  ) %>% 
  select(cdr3:quantity, fcp, fcq, everything())
cdr %>% 
  filter(time == "final") %>% 
  group_by(expgroup, cycle, time) %>% 
  arrange(desc(fcp)) %>% 
  slice_head(prop = .1) %>% 
  # slice_head(n = 1000) %>% 
  ggplot(aes(expgroup, log10(fcp))) +
    geom_violin(aes(fill = expgroup, color = expgroup), alpha = 0.5) +
    geom_jitter(aes(shape = expgroup), alpha = 0.6, size = 1) +
    stat_summary(
      fun = mean,
      fun.min = mean,
      fun.max = mean,
      geom = "crossbar",
      # width = 0.5,
      aes(color = expgroup)
    ) +
    facet_grid(. ~ cycle)

# cdr %>% 
#   filter(str_detect(cycle, "R0")) %>% 
#   filter(time == "final") %>% 
#   group_by(expgroup, cycle, time) %>% 
#   arrange(desc(fcp)) %>% 
#   slice_head(n = 1000) %>% 
#   ggplot(aes(fcp, color = expgroup, fill = expgroup)) +
#     geom_density(stat = "bin", alpha = 0.3) +
#     facet_grid(cycle ~ expgroup)

cdr %<>% 
  group_by(expgroup, cycle, time) %>% 
  arrange(desc(fcp)) %>% 
  slice_head(prop = .1) %>% 
  mutate(
    threshold = mean(log10(fcp))
    ) %>% 
  mutate(
    rich = if_else(
        (log10(fcp) >= threshold) &
          # (time == "final") &
          # (str_detect(cycle, "R0")),
          (time == "final"),
      "rich",
      "medium")) %>% 
  full_join(cdr) %>% 
  mutate(
    rich = if_else(
      is.na(rich),
      "poor",
      rich)) %>% 
  mutate(
    rich = factor(rich,
                  levels = c("rich", "medium", "poor"))
    ) %>% 
  mutate(
    threshold = if_else(
      is.na(threshold),
      0,
      threshold
    )
  )

cdr %<>% 
  drop_na()

cdr %>% 
  filter(rich == "rich") %>%
  ggplot() +
    geom_violin(aes(expgroup, log10(fcp), fill = expgroup)) +
    geom_jitter(aes(expgroup, log10(fcp), shape = expgroup), alpha = .2) +
    facet_grid(rich ~ cycle)

cdr %>% 
  filter(!rich == "rich") %>%
  ggplot() +
    geom_violin(aes(expgroup, log10(fcp), fill = expgroup)) +
    geom_jitter(aes(expgroup, log10(fcp), shape = expgroup), alpha = .2) +
    facet_grid(rich ~ cycle)

names(cdr)
##  [1] "cdr3"      "cycle"     "time"      "expgroup"  "cdrp"      "quantity" 
##  [7] "fcp"       "fcq"       "length"    "MW"        "AV"        "IP"       
## [13] "flex"      "gravy"     "SSF_Helix" "SSF_Turn"  "SSF_Sheet" "n_A"      
## [19] "n_C"       "n_D"       "n_E"       "n_F"       "n_G"       "n_H"      
## [25] "n_I"       "n_K"       "n_L"       "n_M"       "n_N"       "n_P"      
## [31] "n_Q"       "n_R"       "n_S"       "n_T"       "n_V"       "n_W"      
## [37] "n_Y"       "aliphatic" "aromatic"  "neutral"   "positive"  "negative" 
## [43] "invalid"   "file"      "threshold" "rich"
# cdr %>% 
#   filter(rich == "rich") %>% 
#   ungroup() %>% 
#   select(cdr3:SSF_Sheet & !c("cycle", "time", "expgroup")) %>% 
#   ggplot() +
#     geom_density(aes(AV))
# 
# cdr %>% 
#   filter(rich == "poor") %>% 
#   ungroup() %>% 
#   select(cdr3:SSF_Sheet & !c("cycle", "time", "expgroup")) %>% 
#   ggplot() +
#   geom_density(aes(AV))

cdr %>% 
  ggplot() +
    geom_density(aes(AV, color = rich)) +
    facet_grid(cycle ~ .)

cdr %>% 
  ggplot() +
    geom_density(aes(MW, color = rich)) +
    facet_grid(cycle ~ .)

cdr %>% 
  ggplot() +
    geom_density(aes(gravy, color = rich)) +
    facet_grid(cycle ~ .)

cdr %>% 
  ggplot() +
    geom_density(aes(SSF_Turn, color = rich)) +
    facet_grid(cycle ~ .)

cdr %>% 
  ggplot() +
    geom_density(aes(IP, color = rich)) +
    facet_grid(cycle ~ .)

cdr %>% 
  ggplot() +
    geom_density(aes(flex, color = rich)) +
    facet_grid(cycle ~ .)

cdr %>% 
  ggplot(aes(cycle, MW)) +
    geom_violin(aes(fill = rich)) +
    geom_jitter(alpha = .1) +
    facet_grid(rich ~ .)

cdr %>% 
  filter(rich == "rich") %>% 
  ungroup() %>% 
  select(cdr3:SSF_Sheet & !c("cycle", "time", "expgroup")) -> rich66

rich66
## # A tibble: 2,985 x 14
##    cdr3     cdrp quantity   fcp   fcq length    MW     AV    IP  flex   gravy
##    <chr>   <dbl>    <int> <dbl> <dbl>  <int> <dbl>  <dbl> <dbl> <dbl>   <dbl>
##  1 DNSW… 5.26e-4      791  385.  396.      8 1059. 0.375   4.05 0.735 -0.862 
##  2 DRIT… 1.75e-2    26378  216.  222.     11 1222. 0.0909  5.84 0.734 -0.136 
##  3 RMVR… 1.35e-4      203  197.  203       9 1108. 0.111   8.75 0.705  0.0667
##  4 GSSW… 1.32e-4      198  193.  198       9 1055. 0.222   6.00 0.784 -1.79  
##  5 DESH… 6.19e-4      930  181.  186      14 1622. 0.214   4.05 0.773 -1.63  
##  6 SEQR… 3.71e-4      557  181.  186.     10 1169. 0.1     4.05 0.8   -1.73  
##  7 ARTF… 1.23e-4      185  180.  185      10 1207. 0.4     8.63 0.705 -0.570 
##  8 VGRN  1.14e-4      171  166.  171       4  444. 0       9.72 0.767 -1.05  
##  9 GEQW… 1.08e-4      163  158.  163      11 1435. 0.273   6.75 0.724 -0.854 
## 10 ENDD… 1.07e-4      161  157.  161       9 1202. 0.333   4.05 0.754 -1.41  
## # … with 2,975 more rows, and 3 more variables: SSF_Helix <dbl>,
## #   SSF_Turn <dbl>, SSF_Sheet <dbl>
names(cdr)
##  [1] "cdr3"      "cycle"     "time"      "expgroup"  "cdrp"      "quantity" 
##  [7] "fcp"       "fcq"       "length"    "MW"        "AV"        "IP"       
## [13] "flex"      "gravy"     "SSF_Helix" "SSF_Turn"  "SSF_Sheet" "n_A"      
## [19] "n_C"       "n_D"       "n_E"       "n_F"       "n_G"       "n_H"      
## [25] "n_I"       "n_K"       "n_L"       "n_M"       "n_N"       "n_P"      
## [31] "n_Q"       "n_R"       "n_S"       "n_T"       "n_V"       "n_W"      
## [37] "n_Y"       "aliphatic" "aromatic"  "neutral"   "positive"  "negative" 
## [43] "invalid"   "file"      "threshold" "rich"
cdr %>% 
  filter(cycle == "R4") %>% 
  ggplot() +
  geom_violin(aes(rich, gravy, color = rich)) +
  geom_jitter(aes(rich, gravy), alpha = .1)

2 Tentativa de Clusterização

2.1 Clusterization with Quantity and Prevalence variables

library(Rtsne)

set.seed(42)
tsne_df <- cdr %>% 
  filter(time == "final") %>% 
  filter(cycle == "R4") %>% 
  group_by(rich) %>% 
  slice_sample(prop = .1)


tsne_df %>% 
  summarise(across(everything(), ~ all(sum(is.na(.x))))) %>% 
  select(where(isTRUE))
## # A tibble: 3 x 0
tsne_df %>% 
  filter(is.na(threshold)) %>% 
  select(rich, threshold, quantity)
## # A tibble: 0 x 3
## # Groups:   rich [0]
## # … with 3 variables: rich <fct>, threshold <dbl>, quantity <int>
set.seed(42)
tsne_out <- 
  tsne_df %>% 
    ungroup() %>% 
    select(!where(is.character)) %>% 
    select(!c(which(apply(., 2, var)==0))) %>% 
    unique() %>% 
    Rtsne(
      X = .,
      dims = 3,
      perplexity = 30,
      theta = 0.5,
      max_iter = 1E3,
      verbose = T,
      pca_center = T,
      pca_scale = T,
      # partial_pca = T,
      normalize = T,
      eta = 200.0,
      exaggeration_factor = 12.0,
      num_threads = parallel::detectCores() - 2
      )
## Performing PCA
## Read the 6567 x 42 data matrix successfully!
## OpenMP is working. 6 threads.
## Using no_dims = 3, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 8.62 seconds (sparsity = 0.020275)!
## Learning embedding...
## Iteration 50: error is 91.821924 (50 iterations in 10.42 seconds)
## Iteration 100: error is 89.240834 (50 iterations in 9.70 seconds)
## Iteration 150: error is 89.089068 (50 iterations in 8.74 seconds)
## Iteration 200: error is 89.034821 (50 iterations in 9.42 seconds)
## Iteration 250: error is 89.004621 (50 iterations in 10.15 seconds)
## Iteration 300: error is 3.241007 (50 iterations in 8.25 seconds)
## Iteration 350: error is 2.882568 (50 iterations in 7.90 seconds)
## Iteration 400: error is 2.701125 (50 iterations in 8.52 seconds)
## Iteration 450: error is 2.586016 (50 iterations in 8.89 seconds)
## Iteration 500: error is 2.507843 (50 iterations in 9.19 seconds)
## Iteration 550: error is 2.451932 (50 iterations in 9.18 seconds)
## Iteration 600: error is 2.411179 (50 iterations in 9.30 seconds)
## Iteration 650: error is 2.381859 (50 iterations in 9.48 seconds)
## Iteration 700: error is 2.359850 (50 iterations in 9.87 seconds)
## Iteration 750: error is 2.343236 (50 iterations in 8.79 seconds)
## Iteration 800: error is 2.330313 (50 iterations in 9.07 seconds)
## Iteration 850: error is 2.320353 (50 iterations in 9.20 seconds)
## Iteration 900: error is 2.312231 (50 iterations in 9.65 seconds)
## Iteration 950: error is 2.305241 (50 iterations in 9.66 seconds)
## Iteration 1000: error is 2.299074 (50 iterations in 9.82 seconds)
## Fitting performed in 185.21 seconds.
tsne_df %>% 
  ungroup() %>% 
  select(!where(is.character)) %>% 
  select(!c(which(apply(., 2, var)==0))) %>% 
  unique() -> a

tsne_out %>% 
  .$Y %>% 
  as_tibble() %>% 
  ggplot() +
    geom_point(aes(V1, V2, color = a$rich))

tsne_out %>% 
  .$Y %>% 
  as_tibble() %>% 
  plot_ly(
    title = "Sample title",
    x = .$V1,
    y = .$V2,
    z = .$V3,
    type = "scatter3d",
    mode = "markers",
    color = a$rich
  ) %>% 
  layout(title = "With Quantity Variables")

2.2 Clusterization only with Biochemistry Properties

library(Rtsne)

set.seed(42)
tsne_df <- cdr %>% 
  # filter(!rich == "poor") %>%
  filter(time == "final") %>% 
  filter(cycle == "R4") %>% 
  group_by(rich) %>% 
  slice_sample(prop = .1)

tsne_df %>% 
  summarise(across(everything(), ~ all(sum(is.na(.x))))) %>% 
  select(where(isTRUE))
## # A tibble: 3 x 0
tsne_df %>% 
  filter(is.na(threshold)) %>% 
  select(rich, threshold, quantity)
## # A tibble: 0 x 3
## # Groups:   rich [0]
## # … with 3 variables: rich <fct>, threshold <dbl>, quantity <int>
set.seed(42)
tsne_out <- 
  tsne_df %>% 
    ungroup() %>% 
    select(cdr3, !where(is.character)) %>% 
    select(!c(which(apply(., 2, var)==0))) %>% 
    select(!c("quantity", "cdrp", "fcp", "fcq", "rich", "threshold")) %>% 
    mutate(cdr3 = as.factor(cdr3)) %>% 
    Rtsne(
      X = .,
      dims = 3,
      perplexity = 30,
      theta = 0.5,
      max_iter = 1E3,
      verbose = T,
      pca_center = T,
      pca_scale = T,
      normalize = T,
      partial_pca = T,
      eta = 200.0,
      exaggeration_factor = 12.0,
      num_threads = parallel::detectCores()
      )
## Performing PCA
## Read the 6575 x 50 data matrix successfully!
## OpenMP is working. 8 threads.
## Using no_dims = 3, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 11.38 seconds (sparsity = 0.021765)!
## Learning embedding...
## Iteration 50: error is 90.625826 (50 iterations in 19.62 seconds)
## Iteration 100: error is 90.162903 (50 iterations in 23.18 seconds)
## Iteration 150: error is 90.132397 (50 iterations in 21.53 seconds)
## Iteration 200: error is 90.123344 (50 iterations in 21.65 seconds)
## Iteration 250: error is 90.119227 (50 iterations in 20.40 seconds)
## Iteration 300: error is 3.859909 (50 iterations in 17.87 seconds)
## Iteration 350: error is 3.456857 (50 iterations in 12.45 seconds)
## Iteration 400: error is 3.272986 (50 iterations in 14.23 seconds)
## Iteration 450: error is 3.161528 (50 iterations in 13.90 seconds)
## Iteration 500: error is 3.085411 (50 iterations in 13.63 seconds)
## Iteration 550: error is 3.031465 (50 iterations in 15.05 seconds)
## Iteration 600: error is 2.991372 (50 iterations in 14.24 seconds)
## Iteration 650: error is 2.960647 (50 iterations in 15.34 seconds)
## Iteration 700: error is 2.937113 (50 iterations in 14.57 seconds)
## Iteration 750: error is 2.918461 (50 iterations in 14.45 seconds)
## Iteration 800: error is 2.904167 (50 iterations in 15.21 seconds)
## Iteration 850: error is 2.892766 (50 iterations in 14.53 seconds)
## Iteration 900: error is 2.884979 (50 iterations in 15.84 seconds)
## Iteration 950: error is 2.878619 (50 iterations in 14.62 seconds)
## Iteration 1000: error is 2.873402 (50 iterations in 15.27 seconds)
## Fitting performed in 327.57 seconds.
tsne_df %>% 
  ungroup() %>% 
  select(cdr3, !where(is.character)) %>% 
  select(!c(which(apply(., 2, var)==0))) %>% 
  select(!c("quantity", "cdrp", "fcp", "fcq", "threshold")) -> a

tsne_out %>% 
  .$Y %>% 
  as_tibble() %>% 
  ggplot() +
    geom_point(aes(V1, V2, color = a$rich))

tsne_out %>% 
  .$Y %>% 
  as_tibble() %>% 
  plot_ly(
    x = .$V1,
    y = .$V2,
    z = .$V3,
    type = "scatter3d",
    mode = "markers",
    color = a$rich
  ) %>% 
  layout(title = "Without Quantity Variables")